{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Notebook by Alon Agmon.\n",
    "Based on Elkan & Noto \"Learning classifiers from only positive and unlabeled data.\" \n",
    "Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2008.\n",
    "Implementaion code by Alexandre Drouin (https://github.com/aldro61)\n",
    "'''\n",
    "\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "url = \"http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt\"\n",
    "data = pd.read_csv(url, header=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The data set will contain 4 features and target var indicating whether the note is forged or authentic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1372, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.62160</td>\n",
       "      <td>8.6661</td>\n",
       "      <td>-2.80730</td>\n",
       "      <td>-0.44699</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.54590</td>\n",
       "      <td>8.1674</td>\n",
       "      <td>-2.45860</td>\n",
       "      <td>-1.46210</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.86600</td>\n",
       "      <td>-2.6383</td>\n",
       "      <td>1.92420</td>\n",
       "      <td>0.10645</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.45660</td>\n",
       "      <td>9.5228</td>\n",
       "      <td>-4.01120</td>\n",
       "      <td>-3.59440</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.32924</td>\n",
       "      <td>-4.4552</td>\n",
       "      <td>4.57180</td>\n",
       "      <td>-0.98880</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.36840</td>\n",
       "      <td>9.6718</td>\n",
       "      <td>-3.96060</td>\n",
       "      <td>-3.16250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.59120</td>\n",
       "      <td>3.0129</td>\n",
       "      <td>0.72888</td>\n",
       "      <td>0.56421</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2.09220</td>\n",
       "      <td>-6.8100</td>\n",
       "      <td>8.46360</td>\n",
       "      <td>-0.60216</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>3.20320</td>\n",
       "      <td>5.7588</td>\n",
       "      <td>-0.75345</td>\n",
       "      <td>-0.61251</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1.53560</td>\n",
       "      <td>9.1772</td>\n",
       "      <td>-2.27180</td>\n",
       "      <td>-0.73535</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         0       1        2        3  4\n",
       "0  3.62160  8.6661 -2.80730 -0.44699  0\n",
       "1  4.54590  8.1674 -2.45860 -1.46210  0\n",
       "2  3.86600 -2.6383  1.92420  0.10645  0\n",
       "3  3.45660  9.5228 -4.01120 -3.59440  0\n",
       "4  0.32924 -4.4552  4.57180 -0.98880  0\n",
       "5  4.36840  9.6718 -3.96060 -3.16250  0\n",
       "6  3.59120  3.0129  0.72888  0.56421  0\n",
       "7  2.09220 -6.8100  8.46360 -0.60216  0\n",
       "8  3.20320  5.7588 -0.75345 -0.61251  0\n",
       "9  1.53560  9.1772 -2.27180 -0.73535  0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(data.shape)\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Check how balanced the data set is in terms of postives and negatives"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    762\n",
       "1    610\n",
       "Name: 4, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.iloc[:, -1].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Train a classifier to create a baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "x_data = data.iloc[:,:-1]\n",
    "y_data = data.iloc[:,-1]\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "              colsample_bynode=1, colsample_bytree=1, gamma=0,\n",
       "              learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n",
       "              nthread=None, objective='binary:logistic', random_state=0,\n",
       "              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "              silent=None, subsample=1, verbosity=1)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import xgboost as xgb\n",
    "\n",
    "model = xgb.XGBClassifier()\n",
    "\n",
    "model.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = model.predict(x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Establish our baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification results:\n",
      "f1: 99.57%\n",
      "roc: 99.57%\n",
      "recall: 99.15%\n",
      "precision: 100.00%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score\n",
    "\n",
    "def evaluate_results(y_test, y_predict):\n",
    "    print('Classification results:')\n",
    "    f1 = f1_score(y_test, y_predict)\n",
    "    print(\"f1: %.2f%%\" % (f1 * 100.0)) \n",
    "    roc = roc_auc_score(y_test, y_predict)\n",
    "    print(\"roc: %.2f%%\" % (roc * 100.0)) \n",
    "    rec = recall_score(y_test, y_predict, average='binary')\n",
    "    print(\"recall: %.2f%%\" % (rec * 100.0)) \n",
    "    prc = precision_score(y_test, y_predict, average='binary')\n",
    "    print(\"precision: %.2f%%\" % (prc * 100.0)) \n",
    "\n",
    "    \n",
    "evaluate_results(y_test, y_predict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the PU learning approach"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Keep aside 25% of the positives -- they will be the only labeled samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using 153/610 as positives and unlabeling the rest\n"
     ]
    }
   ],
   "source": [
    "mod_data = data.copy()\n",
    "#get the indices of the positives samples\n",
    "pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]\n",
    "#shuffle them\n",
    "np.random.shuffle(pos_ind)\n",
    "# leave just 25% of the positives marked\n",
    "pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))\n",
    "print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')\n",
    "pos_sample = pos_ind[:pos_sample_len]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create the target col 'class_test' that will be 1 for postive and -1 for unlabebed "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "target variable:\n",
      " -1    1219\n",
      " 1     153\n",
      "Name: class_test, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "mod_data['class_test'] = -1\n",
    "mod_data.loc[pos_sample,'class_test'] = 1\n",
    "print('target variable:\\n', mod_data.iloc[:,-1].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### We now have just 153 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1. \n",
    "#### Recall that col 4 still holds the actual label "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>class_test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.62160</td>\n",
       "      <td>8.6661</td>\n",
       "      <td>-2.80730</td>\n",
       "      <td>-0.44699</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.54590</td>\n",
       "      <td>8.1674</td>\n",
       "      <td>-2.45860</td>\n",
       "      <td>-1.46210</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.86600</td>\n",
       "      <td>-2.6383</td>\n",
       "      <td>1.92420</td>\n",
       "      <td>0.10645</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.45660</td>\n",
       "      <td>9.5228</td>\n",
       "      <td>-4.01120</td>\n",
       "      <td>-3.59440</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.32924</td>\n",
       "      <td>-4.4552</td>\n",
       "      <td>4.57180</td>\n",
       "      <td>-0.98880</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.36840</td>\n",
       "      <td>9.6718</td>\n",
       "      <td>-3.96060</td>\n",
       "      <td>-3.16250</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.59120</td>\n",
       "      <td>3.0129</td>\n",
       "      <td>0.72888</td>\n",
       "      <td>0.56421</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2.09220</td>\n",
       "      <td>-6.8100</td>\n",
       "      <td>8.46360</td>\n",
       "      <td>-0.60216</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>3.20320</td>\n",
       "      <td>5.7588</td>\n",
       "      <td>-0.75345</td>\n",
       "      <td>-0.61251</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1.53560</td>\n",
       "      <td>9.1772</td>\n",
       "      <td>-2.27180</td>\n",
       "      <td>-0.73535</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         0       1        2        3  4  class_test\n",
       "0  3.62160  8.6661 -2.80730 -0.44699  0          -1\n",
       "1  4.54590  8.1674 -2.45860 -1.46210  0          -1\n",
       "2  3.86600 -2.6383  1.92420  0.10645  0          -1\n",
       "3  3.45660  9.5228 -4.01120 -3.59440  0          -1\n",
       "4  0.32924 -4.4552  4.57180 -0.98880  0          -1\n",
       "5  4.36840  9.6718 -3.96060 -3.16250  0          -1\n",
       "6  3.59120  3.0129  0.72888  0.56421  0          -1\n",
       "7  2.09220 -6.8100  8.46360 -0.60216  0          -1\n",
       "8  3.20320  5.7588 -0.75345 -0.61251  0          -1\n",
       "9  1.53560  9.1772 -2.27180 -0.73535  0          -1"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mod_data.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results\n",
    "[:-2] is the original class label for positive and negative data\n",
    "[:-1] is the new class for positive and unlabeled data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_data = mod_data.iloc[:,:-2].values # just the X \n",
    "y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)\n",
    "y_positive = mod_data.iloc[:,-2].values # original class"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The training set will be divided into a fitting-set that will be used to fit the estimator in order to estimate P(s=1|X) and a held-out set of positive samples that will be used to estimate P(s=1|y=1)\n",
    "   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_PU_estimator(X,y, hold_out_ratio, estimator):\n",
    "    \n",
    "    # find the indices of the positive/labeled elements\n",
    "    assert (type(y) == np.ndarray), \"Must pass np.ndarray rather than list as y\"\n",
    "    positives = np.where(y == 1.)[0] \n",
    "    # hold_out_size = the *number* of positives/labeled samples \n",
    "    # that we will use later to estimate P(s=1|y=1)\n",
    "    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))\n",
    "    np.random.shuffle(positives)\n",
    "    # hold_out = the *indices* of the positive elements \n",
    "    # that we will later use  to estimate P(s=1|y=1)\n",
    "    hold_out = positives[:hold_out_size] \n",
    "    # the actual positive *elements* that we will keep aside\n",
    "    X_hold_out = X[hold_out] \n",
    "    # remove the held out elements from X and y\n",
    "    X = np.delete(X, hold_out,0) \n",
    "    y = np.delete(y, hold_out)\n",
    "    # We fit the estimator on the unlabeled samples + (part of the) positive and labeled ones.\n",
    "    # In order to estimate P(s=1|X) or  what is the probablity that an element is *labeled*\n",
    "    estimator.fit(X, y)\n",
    "    # We then use the estimator for prediction of the positive held-out set \n",
    "    # in order to estimate P(s=1|y=1)\n",
    "    hold_out_predictions = estimator.predict_proba(X_hold_out)\n",
    "    #take the probability that it is 1\n",
    "    hold_out_predictions = hold_out_predictions[:,1]\n",
    "    # save the mean probability \n",
    "    c = np.mean(hold_out_predictions)\n",
    "    return estimator, c\n",
    "\n",
    "def predict_PU_prob(X, estimator, prob_s1y1):\n",
    "    predicted_s = estimator.predict_proba(X)\n",
    "    predicted_s = predicted_s[:,1]\n",
    "    return predicted_s / prob_s1y1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### test the PU estimation approach"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Learning Iteration::0/24 => P(s=1|y=1)=0.20000000298023224\n",
      "Learning Iteration::4/24 => P(s=1|y=1)=0.20000000298023224\n",
      "Learning Iteration::8/24 => P(s=1|y=1)=0.20000000298023224\n",
      "Learning Iteration::12/24 => P(s=1|y=1)=0.20000000298023224\n",
      "Learning Iteration::16/24 => P(s=1|y=1)=0.18000000715255737\n",
      "Learning Iteration::20/24 => P(s=1|y=1)=0.1899999976158142\n"
     ]
    }
   ],
   "source": [
    "predicted = np.zeros(len(x_data))\n",
    "learning_iterations = 24\n",
    "for index in range(learning_iterations):\n",
    "    pu_estimator, probs1y1 = fit_PU_estimator(x_data, y_labeled, 0.2, xgb.XGBClassifier())\n",
    "    predicted += predict_PU_prob(x_data, pu_estimator, probs1y1)\n",
    "    if(index%4 == 0): \n",
    "        print(f'Learning Iteration::{index}/{learning_iterations} => P(s=1|y=1)={round(probs1y1,2)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### compare the performance of the predictions of the PU approacj (y_predict) with the actuall original classes (y_positive) that we have saved aside"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification results:\n",
      "f1: 94.20%\n",
      "roc: 94.52%\n",
      "recall: 89.18%\n",
      "precision: 99.82%\n"
     ]
    }
   ],
   "source": [
    "y_predict = [1 if x > 0.5 else 0 for x in (predicted/learning_iterations)]\n",
    "evaluate_results(y_positive, y_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.5 64-bit ('NEWDS': conda)",
   "language": "python",
   "name": "python37564bitnewdscondaa8691905f8854f9c935382783aaefb07"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
